# Install udpipe only when it is missing, so re-running the script does not
# reinstall the package (and hit the network) every time.
if (!requireNamespace("udpipe", quietly = TRUE)) {
  install.packages("udpipe")
}
library(udpipe)
library(dplyr)
library(ggplot2)
library(lubridate)

# Fetch the English udpipe model. overwrite = FALSE reuses a model file that
# is already present in the download directory instead of downloading it
# again on every run.
ud_model_info <- udpipe_download_model(language = "english", overwrite = FALSE)

# Fail early with a clear message rather than trying to load a missing file.
if (isTRUE(ud_model_info$download_failed)) {
  stop(
    "Downloading the udpipe model failed: ",
    ud_model_info$download_message,
    call. = FALSE
  )
}

# `ud_model` is the loaded model object used for annotation below.
ud_model <- udpipe_load_model(ud_model_info$file_model)

# Give every tweet a unique identifier. It is stored as character because
# udpipe expects a character doc_id and reports it as character in the
# annotation output; keeping the same type on both sides lets later joins on
# doc_id work without a type mismatch.
unique_combined_data <- unique_combined_data %>%
  mutate(doc_id = as.character(row_number()))

# Annotate the tweet text (tokenisation, POS tagging, ...) and flatten the
# result into a data frame keyed by doc_id.
annotated_data <- udpipe_annotate(
  ud_model,
  x = unique_combined_data$text,
  doc_id = unique_combined_data$doc_id
)
annotated_df <- as.data.frame(annotated_data)

# Count proper-noun tokens per account.
#
# The join key must have the same type on both sides. The annotation output
# carries doc_id as character, so the tweet table's doc_id is converted
# before joining; joining first fails on incompatible key types.
unique_combined_data <- unique_combined_data %>%
  mutate(doc_id = as.character(doc_id))

# Keep only tokens tagged as proper nouns (universal POS tag "PROPN").
# The as.character() call is a no-op when doc_id is already character.
proper_nouns <- annotated_df %>%
  filter(upos == "PROPN") %>%
  mutate(doc_id = as.character(doc_id))

# Attach the tweet columns to every proper-noun token, then tally the tokens
# per account (screen_name).
proper_noun_counts <- proper_nouns %>%
  inner_join(unique_combined_data, by = "doc_id") %>%
  count(account = screen_name, name = "proper_nouns")

# Inspect the structure of the annotation table (columns and their types).
str(annotated_df)

# Proper nouns, de-duplicated within each tweet: a given token is kept at
# most once per doc_id. distinct() over (doc_id, token) does this directly
# and returns an ungrouped result, so no grouping leaks into the join below.
proper_nouns <- annotated_df %>%
  filter(upos == "PROPN") %>%
  mutate(doc_id = as.character(doc_id)) %>%
  distinct(doc_id, token, .keep_all = TRUE)

# Ensure the join key is character on the tweet side as well.
unique_combined_data <- unique_combined_data %>%
  mutate(doc_id = as.character(doc_id))

# Attach the tweet columns (screen_name, created, ...) to each proper noun.
proper_nouns_joined <- proper_nouns %>%
  inner_join(unique_combined_data, by = "doc_id")

# Number of (per-tweet unique) proper nouns per account and calendar month.
# floor_date() maps every `created` value to the first instant of its month.
proper_nouns_monthly <- proper_nouns_joined %>%
  group_by(account = screen_name, month = floor_date(created, "month")) %>%
  summarise(proper_noun_count = n(), .groups = "drop")

# Line chart of the monthly proper-noun counts, one line per account.
# Both accounts are drawn in black and told apart by line type
# (PNHP solid, P4AHCF dashed); both scales use the same legend title.
ggplot(
  data = proper_nouns_monthly,
  mapping = aes(
    x = month,
    y = proper_noun_count,
    color = account,
    linetype = account
  )
) +
  geom_line() +
  scale_color_manual(
    name = "Twitter Account",
    values = c("PNHP" = "black", "P4AHCF" = "black")
  ) +
  scale_linetype_manual(
    name = "Twitter Account",
    values = c("PNHP" = "solid", "P4AHCF" = "dashed")
  ) +
  theme_minimal() +
  labs(
    x = "Month",
    y = "Count of Unique Proper Nouns",
    title = "Monthly Unique Proper Noun Counts by Twitter Account"
  )

# Monthly proper-noun counts per account, with a red vertical marker and
# label at the start of the COVID-19 period.
# NOTE(review): the Date-typed marker assumes `month` is a Date; if `created`
# is a date-time (POSIXct), the marker needs as.POSIXct() instead -- confirm.
covid_start <- as.Date("2020-03-01")

ggplot(
  proper_nouns_monthly,
  aes(x = month, y = proper_noun_count, color = account, linetype = account)
) +
  geom_line() +
  geom_vline(xintercept = covid_start, color = "red", linetype = "solid") +
  # annotate() draws the label exactly once. A geom_text() layer with constant
  # aesthetics would draw one copy per row of the data, stacking identical
  # labels on top of each other.
  annotate(
    "text",
    x = covid_start,
    y = max(proper_nouns_monthly$proper_noun_count, na.rm = TRUE),
    label = "COVID-19 start",
    vjust = -0.5,
    color = "red",
    size = 3
  ) +
  scale_color_manual(
    values = c("PNHP" = "black", "P4AHCF" = "black"),
    name = "Twitter Account"
  ) +
  scale_linetype_manual(
    values = c("PNHP" = "solid", "P4AHCF" = "dashed"),
    name = "Twitter Account"
  ) +
  labs(
    title = "Monthly Unique Proper Noun Counts by Twitter Account",
    x = "Month",
    y = "Count of Unique Proper Nouns"
  ) +
  theme_minimal()
